1 /*
2 * Copyright (C) 2013 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5 * in compliance with the License. You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software distributed under the License
10 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11 * or implied. See the License for the specific language governing permissions and limitations under
12 * the License.
13 */
14
15 package com.google.common.base;
16
17 import static com.google.common.base.Preconditions.checkPositionIndexes;
18
19 import com.google.common.annotations.Beta;
20 import com.google.common.annotations.GwtCompatible;
21
22 /**
23 * Low-level, high-performance utility methods related to the {@linkplain Charsets#UTF_8 UTF-8}
24 * character encoding. UTF-8 is defined in section D92 of
25 * <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode Standard Core
26 * Specification, Chapter 3</a>.
27 *
28 * <p>The variant of UTF-8 implemented by this class is the restricted definition of UTF-8
29 * introduced in Unicode 3.1. One implication of this is that it rejects
30 * <a href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest form"</a> byte
31 * sequences, even though the JDK decoder may accept them.
32 *
33 * @author Martin Buchholz
34 * @author Clément Roux
35 * @since 16.0
36 */
37 @Beta
38 @GwtCompatible
39 public final class Utf8 {
40 /**
41 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
42 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
43 * both time and space.
44 *
45 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
46 * surrogates)
47 */
48 public static int encodedLength(CharSequence sequence) {
49 // Warning to maintainers: this implementation is highly optimized.
50 int utf16Length = sequence.length();
51 int utf8Length = utf16Length;
52 int i = 0;
53
54 // This loop optimizes for pure ASCII.
55 while (i < utf16Length && sequence.charAt(i) < 0x80) {
56 i++;
57 }
58
59 // This loop optimizes for chars less than 0x800.
60 for (; i < utf16Length; i++) {
61 char c = sequence.charAt(i);
62 if (c < 0x800) {
63 utf8Length += ((0x7f - c) >>> 31); // branch free!
64 } else {
65 utf8Length += encodedLengthGeneral(sequence, i);
66 break;
67 }
68 }
69
70 if (utf8Length < utf16Length) {
71 // Necessary and sufficient condition for overflow because of maximum 3x expansion
72 throw new IllegalArgumentException("UTF-8 length does not fit in int: "
73 + (utf8Length + (1L << 32)));
74 }
75 return utf8Length;
76 }
77
78 private static int encodedLengthGeneral(CharSequence sequence, int start) {
79 int utf16Length = sequence.length();
80 int utf8Length = 0;
81 for (int i = start; i < utf16Length; i++) {
82 char c = sequence.charAt(i);
83 if (c < 0x800) {
84 utf8Length += (0x7f - c) >>> 31; // branch free!
85 } else {
86 utf8Length += 2;
87 // jdk7+: if (Character.isSurrogate(c)) {
88 if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
89 // Check that we have a well-formed surrogate pair.
90 int cp = Character.codePointAt(sequence, i);
91 if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
92 throw new IllegalArgumentException("Unpaired surrogate at index " + i);
93 }
94 i++;
95 }
96 }
97 }
98 return utf8Length;
99 }
100
101 /**
102 * Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte sequence according to
103 * Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be
104 * decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte
105 * sequences, but encoding never reproduces these. Such byte sequences are <i>not</i> considered
106 * well-formed.
107 *
108 * <p>This method returns {@code true} if and only if {@code Arrays.equals(bytes, new
109 * String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both time and space.
110 */
111 public static boolean isWellFormed(byte[] bytes) {
112 return isWellFormed(bytes, 0, bytes.length);
113 }
114
115 /**
116 * Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by
117 * {@link #isWellFormed(byte[])}. Note that this can be false even when {@code
118 * isWellFormed(bytes)} is true.
119 *
120 * @param bytes the input buffer
121 * @param off the offset in the buffer of the first byte to read
122 * @param len the number of bytes to read from the buffer
123 */
124 public static boolean isWellFormed(byte[] bytes, int off, int len) {
125 int end = off + len;
126 checkPositionIndexes(off, end, bytes.length);
127 // Look for the first non-ASCII character.
128 for (int i = off; i < end; i++) {
129 if (bytes[i] < 0) {
130 return isWellFormedSlowPath(bytes, i, end);
131 }
132 }
133 return true;
134 }
135
136 private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) {
137 int index = off;
138 while (true) {
139 int byte1;
140
141 // Optimize for interior runs of ASCII bytes.
142 do {
143 if (index >= end) {
144 return true;
145 }
146 } while ((byte1 = bytes[index++]) >= 0);
147
148 if (byte1 < (byte) 0xE0) {
149 // Two-byte form.
150 if (index == end) {
151 return false;
152 }
153 // Simultaneously check for illegal trailing-byte in leading position
154 // and overlong 2-byte form.
155 if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
156 return false;
157 }
158 } else if (byte1 < (byte) 0xF0) {
159 // Three-byte form.
160 if (index + 1 >= end) {
161 return false;
162 }
163 int byte2 = bytes[index++];
164 if (byte2 > (byte) 0xBF
165 // Overlong? 5 most significant bits must not all be zero.
166 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
167 // Check for illegal surrogate codepoints.
168 || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2)
169 // Third byte trailing-byte test.
170 || bytes[index++] > (byte) 0xBF) {
171 return false;
172 }
173 } else {
174 // Four-byte form.
175 if (index + 2 >= end) {
176 return false;
177 }
178 int byte2 = bytes[index++];
179 if (byte2 > (byte) 0xBF
180 // Check that 1 <= plane <= 16. Tricky optimized form of:
181 // if (byte1 > (byte) 0xF4
182 // || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90
183 // || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
184 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
185 // Third byte trailing-byte test
186 || bytes[index++] > (byte) 0xBF
187 // Fourth byte trailing-byte test
188 || bytes[index++] > (byte) 0xBF) {
189 return false;
190 }
191 }
192 }
193 }
194
195 private Utf8() {}
196 }